read.delim() is used to read tab-separated data. Thereafter, the required columns are filtered and the first column is used as the row names. The dataset is scaled to make it ready for plotting the heatmap.
library(plotly)
library(seriation)
library(GGally)
library(tidyverse)
# Read the tab-separated prices-and-earnings data, keep only the columns of
# interest, index the rows by city, and standardise the numeric columns.
pricesnearn <- read.delim("prices-and-earnings.txt", stringsAsFactors = FALSE)
keep_cols <- c(1, 2, 5, 6, 7, 9, 10, 16, 17, 18, 19)
pricesnearn <- pricesnearn[, keep_cols]
rownames(pricesnearn) <- pricesnearn$City
# Drop the City column before scaling; scale() centres and scales each column.
scaledprices <- scale(pricesnearn[, -1])
Without reordering, only very small clusters can be seen and it is difficult to spot outliers.
# Plot margins (in pixels) shared by all heatmaps below.
m <- list(l = 10, r = 10, b = 50, pad = 2)
# Draw a plotly heatmap of `df` (rows = cities, columns = variables) with the
# given title. Uses the global margin list `m` defined above.
heatmapp <- function(df, str) {
  p <- plot_ly(
    width = 900, height = 1200,
    x = colnames(df),
    y = rownames(df),
    z = df,
    type = "heatmap",
    colors = colorRamp(c("yellow", "red")),
    colorbar = list(title = "Color-Scale")
  )
  layout(p, title = str, margin = m)
}
heatmapp(scaledprices, "Heatmap of Countries")
Comparing the two heatmaps generated from the Euclidean distance and the 1-correlation distance — both of which compute orders that optimize Hamiltonian path length and use hierarchical clustering (HC) as the optimization algorithm — the heatmap generated from the Euclidean distance seems to have better clustering. Even though the permutation of rows is the same in both methods, there is a difference in how the columns are arranged. Using the Euclidean method, more similar clusters are visible between countries because the gradient of color is smaller.
Using 1-correlation, similar clusters are visible for Bread, Big Mac and iPhone prices among various countries around the central-left part of the map, while with the Euclidean distance more similar clusters are visible around the top quarter of the map, on the central left and at the bottom right of the map.
# --- OLO seriation (hierarchical clustering as optimizer) ---
# Euclidean distances between rows (cities) and between columns (variables).
dist_cols <- dist(t(scaledprices), method = "euclidean", diag = FALSE)
dist_rows <- dist(scaledprices, method = "euclidean", diag = FALSE)
order1_euc <- seriate(dist_rows, "OLO")
order2 <- seriate(dist_cols, "OLO")
ord1 <- get_order(order1_euc)
ord2 <- get_order(order2)
# Reverse the row order so the heatmap reads top-down.
scaledprices_euc <- scaledprices[rev(ord1), ord2]
heatmapp(scaledprices_euc, "Heatmap using Euclidean Distances")
# Same seriation with 1 - correlation as the dissimilarity measure.
corr_rows <- 1 - cor(t(scaledprices))
corr_cols <- 1 - cor(scaledprices)
order1 <- seriate(as.dist(corr_rows), "OLO")
order2 <- seriate(as.dist(corr_cols), "OLO")
ord1 <- get_order(order1)
ord2 <- get_order(order2)
scaledprices_corr <- scaledprices[ord1, ord2]
heatmapp(scaledprices_corr, "Heatmap using 1-Correlation as Distances")
A heatmap of the dataset is generated by permuting rows and columns using the Hamiltonian path length with TSP as the optimizer. The rows and columns in this heatmap are arranged differently from the heatmap generated by HC. Additionally, the clusters arranged by TSP are more similar than those from HC, which is visually evident.
# Seriate rows and columns with a TSP solver over the same Euclidean distances.
order1_tsp <- seriate(dist_rows, "TSP")
order2 <- seriate(dist_cols, "TSP")
ord1 <- get_order(order1_tsp)
ord2 <- get_order(order2)
# BUG FIX: the criterion must be evaluated on the TSP row order; the original
# passed `order1`, the stale 1-correlation OLO order from the previous section,
# so the reported values did not describe the TSP solution at all.
TSP_solv <- criterion(dist_rows, order1_tsp, method = c("Path_Length", "Gradient_raw"))
scaledprices_euc <- scaledprices[rev(ord1), ord2]
heatmapp(scaledprices_euc, "Heatmap-TSP using Hamiltonian Path Length")
Here we compare how efficient TSP is against hierarchical clustering as an optimizer for clustering, on the basis of different loss/merit functions.
## HC TSP
## Path_length 121.9671 122.9134
## Gradient_weighted 156151.0604 99871.6895
## Gradient_raw 65528.0000 43048.0000
Here there are two prominent clusters which can be identified based on the variable “iphone” having a value greater than 56 or less than 56. The clusters can be colored using red and blue.
Visually the clusters become much clearer if the variables are arranged in the following order from left to right: Big Mac, Bread, iPhone 4S, Rice (in kg), Hours Worked, Food Cost, Clothing Index, Wage Net, Goods and Services, Vacation.
# Parallel-coordinates plot; lines are coloured by whether the iPhone 4S costs
# more than 56 hours of work (1 = blue) or not (0 = red).
pricesnearn$PhoneColor <- ifelse(pricesnearn$iPhone.4S.hr. > 56, 1, 0)
# Axis definitions, one entry per variable shown on the parallel plot.
dims <- list(
  list(label = "Food Cost", values = ~Food.Costs...),
  list(label = "Iphone4S", values = ~iPhone.4S.hr.),
  list(label = "Clothing Index", values = ~Clothing.Index),
  list(label = "Hours Worked", values = ~Hours.Worked),
  list(label = "Wage Net", values = ~Wage.Net),
  list(label = "Vacation Days", values = ~Vacation.Days),
  list(label = "Big Mac", values = ~Big.Mac.min.),
  list(label = "Bread(in kg)", values = ~Bread.kg.in.min.),
  list(label = "Rice(in kg)", values = ~Rice.kg.in.min.),
  list(label = "Goods and Services", values = ~Goods.and.Services...)
)
p <- pricesnearn %>%
  plot_ly(width = 1000, height = 600, type = "parcoords",
          line = list(color = ~PhoneColor,
                      colorscale = list(c(0, 'red'), c(1, 'blue'))),
          dimensions = dims) %>%
  layout(title = "Parallel Plot")
p
library(scales)
# One radar (scatterpolar) chart per city, laid out four per row via flexbox.
scaledprices_euc <- as.data.frame(scaledprices_euc)
nPlot <- nrow(scaledprices_euc)
# BUG FIX: add_rownames() is deprecated (and defunct in current tibble);
# rownames_to_column() is the supported replacement with the same result.
scaledprices_radar <- tibble::rownames_to_column(scaledprices_euc, var = "City")
Ps <- vector("list", nPlot)  # preallocate instead of growing the list
for (i in seq_len(nPlot)) {
  Ps[[i]] <- htmltools::tags$div(
    plot_ly(type = "scatterpolar",
            r = as.numeric(scaledprices_radar[i, -1]),
            theta = colnames(scaledprices_radar)[-1],
            fill = "toself") %>%
      layout(title = scaledprices_radar$City[i]),
    style = "width: 25%;")
}
h <- htmltools::tags$div(style = "display: flex; flex-wrap: wrap; ", Ps)
htmltools::browsable(h)
The first cluster includes Tokyo, Seoul, Hong Kong and Toronto. On stacking one chart above another, it can be seen that the variables of Tokyo are not similar to those of the other countries. Food Cost, Clothing Index and Goods and Services can be considered outliers for Tokyo with respect to the other countries.
The second cluster includes Chicago, Los Angeles, Miami and Tel Aviv. The “Rice in kg” variable for Tel Aviv can be considered an outlier.
# Overlay the scaled profiles of each cluster's cities on a single radar chart.
# Builds the same empty scatterpolar figure and adds one trace per city.
radar_cluster <- function(cities) {
  fig <- plot_ly(type = 'scatterpolar', fill = "toself")
  for (city in cities) {
    fig <- add_trace(fig,
                     r = as.numeric(scaledprices_corr[city, -11]),
                     theta = colnames(scaledprices_corr)[-11],
                     name = city)
  }
  fig
}
# First cluster.
radar_cluster(c("Tokyo", "Seoul", "Hong Kong", "Toronto"))
# Second cluster.
radar_cluster(c("Chicago", "Los Angeles", "Miami", "Tel Aviv"))
Among heatmaps, parallel coordinate plots and radar charts, heatmaps seem to be best at analysing the similarity of variables and objects due to the visible gradient in color. With parallel plots it becomes difficult to trace all lines and sometimes it might be difficult to see a pattern. With a radar chart it might be difficult to judge orientation. Radar charts and parallel plots are good at identifying outliers.
# Load the 1994 adult census extract (columns keep their X-prefixed names).
population1994 <- read.csv(file = 'adult.csv')
Analysis
In a scatterplot the data is cluttered, so it is difficult to analyze the income levels <=50k and >50k. But in the Trellis plot the data is split into two panels based on whether the income level is <=50k or >50k, so it is easier to analyze. For income level <=50k, most of the population aged 25-50 works 35-50 hours per week; for income level >50k, most of the population aged 25-50 works 40-50 hours per week.
# Scatter plot: hours per week (X40) vs age (X39), coloured by income level.
hour_age <- population1994 %>%
  select(X40, X39, X..50K)
ggplot(hour_age, aes(x = X40, y = X39, color = X..50K)) +
  geom_point(size = 1) +
  ggtitle("Hours per Week, Age, & Incomelevel") +
  labs(x = "Hours per week", y = "Age")
# Trellis plot: the same scatter, one panel per income level.
ggplot(hour_age, aes(x = X40, y = X39, color = X..50K)) +
  geom_point(size = 1) +
  ggtitle("Hours per Week, Age, & Incomelevel") +
  labs(x = "Hours per week", y = "Age") +
  facet_grid(X..50K ~ .)
Analysis
In ggplot, the income level is grouped by age and we can analyze the density of data is high in 35-50 Age in the income level >50k and in the income level <=50k the density of data is high in Age 15-30.
In Trellis plot the income level is grouped by age and divided into seven panels :
Divorced: in this category, for people whose income level is <=50k the bulk is in age group 30-50, and for income level >50k it is in age group 40-50.
Married-AF-spouse: in this category, for people whose income level is <=50k the bulk is in age group 30-40, and for income level >50k it is in age group 25-40.
Married-civ-spouse: in this category, for people whose income level is <=50k the bulk is in age group 27-30, and for income level >50k it is in age group 35-50.
Married-spouse-absent: in this category, for people whose income level is <=50k the bulk is in age group 28-35, and for income level >50k it is in age group 45-50.
Never-Married: in this category, for people whose income level is <=50k the bulk is in age group 0-25, and for income level >50k it is in age group 35-40.
Separated: in this category, for people whose income level is <=50k the bulk is in age group 26-30, and for income level >50k it is in age group 30-45.
Widowed: in this category, for people whose income level is <=50k the bulk is in age group 58-60, and for income level >50k it is in age group 53-65.
# ggplot: density of age by income level.
Age_income <- population1994 %>%
  select(X39, X..50K, Never.married)
# BUG FIX: refer to columns directly inside aes(); `Age_income$X39` bypasses
# ggplot's data masking and breaks faceting/grouping on subsetted data.
ggplot(Age_income, aes(x = X39, group = X..50K, fill = X..50K)) +
  ggtitle("Age grouped by the Income level") +
  geom_density(alpha = 0.3) +
  xlab("AGE") +
  ylab("Income level")
# Trellis plot: the same densities faceted by marital status.
# BUG FIX: the aesthetic is `colour`, not `colors` — the original misspelling
# was silently ignored, so the density outlines were never coloured.
ggplot(Age_income, aes(x = X39, group = X..50K, colour = X..50K, fill = X..50K)) +
  ggtitle("Age grouped by the Income level") +
  geom_density(alpha = 0.3) +
  xlab("AGE") +
  ylab("Income level") +
  facet_wrap(Never.married ~ .)
Analysis
In the 3D scatter plot the quantity of data is high and overplotted, so it is difficult to analyze the data values. In the 2D density (raster-type) plot we can readily notice, in each panel, the capital loss based on Age and Education-num. For example, in the 4th panel (from the top) the age group 41-46 has a capital loss of 2000 for Education_num between 9 and 15, and in the 1st panel the age group 17-26 has a capital loss of 1500 for Education_num between 9 and 13. So we can compare the panels, and it is easy to analyze the data in the Trellis plot, whereas in the 3D scatter plot we cannot analyze the data as we did in the Trellis plot.
# 3-D scatter of capital loss vs education-num and age (non-zero losses only).
c <- population1994 %>% filter(X0 != 0)
axis_titles <- list(
  xaxis = list(title = 'Education-num'),
  yaxis = list(title = 'Age'),
  zaxis = list(title = 'Capital loss')
)
p <- plot_ly(c, x = ~X13, y = ~X39, z = ~X0) %>%
  add_markers() %>%
  layout(scene = axis_titles)
p
#Trellis plot: 2-D density of capital loss vs education-num, faceted by age bin.
Age <- as.vector(c$X39)
c$Agerange <- cut_number(Age, n = 6)  # six equal-count age bins
p <- ggplot(c, aes(x = X13, y = X0)) +
  stat_density_2d(geom = "raster", aes(fill = ..density..), contour = FALSE) +
  facet_grid(Agerange ~ .) +
  ggtitle("Capital loss vs Education_num-Trellis plot") +
  xlab("Education_num") +
  ylab("Capital loss")
# BUG FIX: pass the plot explicitly. ggplotly() with no argument falls back to
# last_plot(), which is only set when a plot is printed — the assigned-but-
# unprinted `p` could be skipped in favour of a stale plot. Also fixed the
# "Captial" typo in the y-axis label.
q <- ggplotly(p)
q
Analysis
In the first plot we used facet_grid without shingles, which leads to loss of data between the panels. In the second plot we used facets with shingles and a 10% overlap, so there is no loss of data between panels — this is the advantage of shingles. At first glance we cannot see any difference between the plots because the amount of lost data is small, but on close observation we can see data lost between panels. The disadvantage is that when the amount of affected data is small, shingles do not show much visible difference. The advantage of shingles is that, because of the overlap, we will not miss trends in the data.
# Capital loss vs education-num, faceted into four equal-count age bins
# (plain facet_grid, i.e. no shingles / no overlap between panels).
c <- population1994 %>% filter(X0 != 0)
g <- ggplot(data = c, aes(x = X13, y = X0, color = X0)) +
  geom_point() +
  facet_grid(cut_number(X39, n = 4) ~ .) +
  ggtitle("Capital Loss versus Education-num & Age") +
  xlab("Education_num") +
  ylab("Capital loss")
# BUG FIX: pass the plot explicitly — ggplotly() with no argument relies on
# last_plot(), which is only set on print, so `g` could be missed. Also fixed
# the "Captial" typo in the y-axis label.
r <- ggplotly(g)
r
# Build four overlapping age shingles (lattice::equal.count, 10% overlap) and
# expand the data so each row appears once per shingle interval containing it.
Age_Range <- lattice::equal.count(Age, number = 4, overlap = 0.1)
L <- matrix(unlist(levels(Age_Range)), ncol = 2, byrow = TRUE)
L1 <- data.frame(Lower = L[, 1], Upper = L[, 2], Interval = factor(seq_len(nrow(L))))
index <- c()
Class <- c()
for (i in seq_len(nrow(L))) {
  # Label for this shingle, e.g. "[17,30]". (The original computed this twice.)
  Cl <- paste0("[", L1$Lower[i], ",", L1$Upper[i], "]")
  ind <- which(c$X39 >= L1$Lower[i] & c$X39 <= L1$Upper[i])
  index <- c(index, ind)
  Class <- c(Class, rep(Cl, length(ind)))
}
data1994 <- c[index, ]
data1994$Class <- as.factor(Class)
p <- ggplot(data = data1994, aes(x = X13, y = X0, color = X0)) +
  geom_point() +
  facet_grid(Class ~ .) +
  ggtitle("Education vs. Capital Loss using Shingles") +
  xlab("Education_num") +
  ylab("Capital loss")
# BUG FIX: pass the plot explicitly (ggplotly() default uses last_plot(),
# which is only set on print). Also fixed the "Captial" y-axis label typo.
q <- ggplotly(p)
q
library(plotly)
library(seriation)
library(GGally)
library(tidyverse)
library(scales)
# Read the tab-separated prices-and-earnings data, keep only the columns of
# interest, index the rows by city, and standardise the numeric columns.
pricesnearn <- read.delim("prices-and-earnings.txt", stringsAsFactors = FALSE)
keep_cols <- c(1, 2, 5, 6, 7, 9, 10, 16, 17, 18, 19)
pricesnearn <- pricesnearn[, keep_cols]
rownames(pricesnearn) <- pricesnearn$City
# Drop the City column before scaling; scale() centres and scales each column.
scaledprices <- scale(pricesnearn[, -1])
# Plot margins (in pixels) shared by all heatmaps below.
m <- list(l = 10, r = 10, b = 50, pad = 2)
# Draw a plotly heatmap of `df` (rows = cities, columns = variables) with the
# given title. Uses the global margin list `m` defined above.
heatmapp <- function(df, str) {
  p <- plot_ly(
    width = 900, height = 1200,
    x = colnames(df),
    y = rownames(df),
    z = df,
    type = "heatmap",
    colors = colorRamp(c("yellow", "red")),
    colorbar = list(title = "Color-Scale")
  )
  layout(p, title = str, margin = m)
}
heatmapp(scaledprices, "Heatmap of Countries")
# --- OLO seriation (hierarchical clustering as optimizer) ---
# Euclidean distances between rows (cities) and between columns (variables).
dist_cols <- dist(t(scaledprices), method = "euclidean", diag = FALSE)
dist_rows <- dist(scaledprices, method = "euclidean", diag = FALSE)
order1_euc <- seriate(dist_rows, "OLO")
order2 <- seriate(dist_cols, "OLO")
ord1 <- get_order(order1_euc)
ord2 <- get_order(order2)
# Reverse the row order so the heatmap reads top-down.
scaledprices_euc <- scaledprices[rev(ord1), ord2]
heatmapp(scaledprices_euc, "Heatmap using Euclidean Distances")
# Same seriation with 1 - correlation as the dissimilarity measure.
corr_rows <- 1 - cor(t(scaledprices))
corr_cols <- 1 - cor(scaledprices)
order1 <- seriate(as.dist(corr_rows), "OLO")
order2 <- seriate(as.dist(corr_cols), "OLO")
ord1 <- get_order(order1)
ord2 <- get_order(order2)
scaledprices_corr <- scaledprices[ord1, ord2]
heatmapp(scaledprices_corr, "Heatmap using 1-Correlation as Distances")
# Seriate rows and columns with a TSP solver over the same Euclidean distances.
order1_tsp <- seriate(dist_rows, "TSP")
order2 <- seriate(dist_cols, "TSP")
ord1 <- get_order(order1_tsp)
ord2 <- get_order(order2)
# BUG FIX: the criterion must be evaluated on the TSP row order; the original
# passed `order1`, the stale 1-correlation OLO order from the previous section.
TSP_solv <- criterion(dist_rows, order1_tsp, method = c("Path_Length", "Gradient_raw"))
scaledprices_euc <- scaledprices[rev(ord1), ord2]
heatmapp(scaledprices_euc, "Heatmap-TSP using Hamiltonian Path Length")
# Compare the HC (OLO) and TSP orderings under the same loss/merit functions.
cbind(HC = criterion(dist_rows, order1_euc, method = c("Path_Length", "Gradient_Weighted", "Gradient_raw")),
      TSP = criterion(dist_rows, order1_tsp, method = c("Path_Length", "Gradient_Weighted", "Gradient_raw")))
# Parallel-coordinates plot; lines are coloured by whether the iPhone 4S costs
# more than 56 hours of work (1 = blue) or not (0 = red).
pricesnearn$PhoneColor <- ifelse(pricesnearn$iPhone.4S.hr. > 56, 1, 0)
# Axis definitions, one entry per variable shown on the parallel plot.
dims <- list(
  list(label = "Food Cost", values = ~Food.Costs...),
  list(label = "Iphone4S", values = ~iPhone.4S.hr.),
  list(label = "Clothing Index", values = ~Clothing.Index),
  list(label = "Hours Worked", values = ~Hours.Worked),
  list(label = "Wage Net", values = ~Wage.Net),
  list(label = "Vacation Days", values = ~Vacation.Days),
  list(label = "Big Mac", values = ~Big.Mac.min.),
  list(label = "Bread(in kg)", values = ~Bread.kg.in.min.),
  list(label = "Rice(in kg)", values = ~Rice.kg.in.min.),
  list(label = "Goods and Services", values = ~Goods.and.Services...)
)
p <- pricesnearn %>%
  plot_ly(width = 1000, height = 600, type = "parcoords",
          line = list(color = ~PhoneColor,
                      colorscale = list(c(0, 'red'), c(1, 'blue'))),
          dimensions = dims) %>%
  layout(title = "Parallel Plot")
p
# One radar (scatterpolar) chart per city, laid out four per row via flexbox.
scaledprices_euc <- as.data.frame(scaledprices_euc)
nPlot <- nrow(scaledprices_euc)
# BUG FIX: add_rownames() is deprecated (and defunct in current tibble);
# rownames_to_column() is the supported replacement with the same result.
scaledprices_radar <- tibble::rownames_to_column(scaledprices_euc, var = "City")
Ps <- vector("list", nPlot)  # preallocate instead of growing the list
for (i in seq_len(nPlot)) {
  Ps[[i]] <- htmltools::tags$div(
    plot_ly(type = "scatterpolar",
            r = as.numeric(scaledprices_radar[i, -1]),
            theta = colnames(scaledprices_radar)[-1],
            fill = "toself") %>%
      layout(title = scaledprices_radar$City[i]),
    style = "width: 25%;")
}
h <- htmltools::tags$div(style = "display: flex; flex-wrap: wrap; ", Ps)
htmltools::browsable(h)
# Overlay the scaled profiles of each cluster's cities on a single radar chart.
# Builds the same empty scatterpolar figure and adds one trace per city.
radar_cluster <- function(cities) {
  fig <- plot_ly(type = 'scatterpolar', fill = "toself")
  for (city in cities) {
    fig <- add_trace(fig,
                     r = as.numeric(scaledprices_corr[city, -11]),
                     theta = colnames(scaledprices_corr)[-11],
                     name = city)
  }
  fig
}
# First cluster.
radar_cluster(c("Tokyo", "Seoul", "Hong Kong", "Toronto"))
# Second cluster.
radar_cluster(c("Chicago", "Los Angeles", "Miami", "Tel Aviv"))
# Load the 1994 adult census extract (columns keep their X-prefixed names).
population1994 <- read.csv(file = 'adult.csv')
# Scatter plot: hours per week (X40) vs age (X39), coloured by income level.
hour_age <- population1994 %>%
  select(X40, X39, X..50K)
ggplot(hour_age, aes(x = X40, y = X39, color = X..50K)) +
  geom_point(size = 1) +
  ggtitle("Hours per Week, Age, & Incomelevel") +
  labs(x = "Hours per week", y = "Age")
# Trellis plot: the same scatter, one panel per income level.
ggplot(hour_age, aes(x = X40, y = X39, color = X..50K)) +
  geom_point(size = 1) +
  ggtitle("Hours per Week, Age, & Incomelevel") +
  labs(x = "Hours per week", y = "Age") +
  facet_grid(X..50K ~ .)
# ggplot: density of age by income level.
Age_income <- population1994 %>%
  select(X39, X..50K, Never.married)
# BUG FIX: refer to columns directly inside aes(); `Age_income$X39` bypasses
# ggplot's data masking and breaks faceting/grouping on subsetted data.
ggplot(Age_income, aes(x = X39, group = X..50K, fill = X..50K)) +
  ggtitle("Age grouped by the Income level") +
  geom_density(alpha = 0.3) +
  xlab("AGE") +
  ylab("Income level")
# Trellis plot: the same densities faceted by marital status.
# BUG FIX: the aesthetic is `colour`, not `colors` — the original misspelling
# was silently ignored, so the density outlines were never coloured.
ggplot(Age_income, aes(x = X39, group = X..50K, colour = X..50K, fill = X..50K)) +
  ggtitle("Age grouped by the Income level") +
  geom_density(alpha = 0.3) +
  xlab("AGE") +
  ylab("Income level") +
  facet_wrap(Never.married ~ .)
# 3-D scatter of capital loss vs education-num and age (non-zero losses only).
c <- population1994 %>% filter(X0 != 0)
axis_titles <- list(
  xaxis = list(title = 'Education-num'),
  yaxis = list(title = 'Age'),
  zaxis = list(title = 'Capital loss')
)
p <- plot_ly(c, x = ~X13, y = ~X39, z = ~X0) %>%
  add_markers() %>%
  layout(scene = axis_titles)
p
#Trellis plot: 2-D density of capital loss vs education-num, faceted by age bin.
Age <- as.vector(c$X39)
c$Agerange <- cut_number(Age, n = 6)  # six equal-count age bins
p <- ggplot(c, aes(x = X13, y = X0)) +
  stat_density_2d(geom = "raster", aes(fill = ..density..), contour = FALSE) +
  facet_grid(Agerange ~ .) +
  ggtitle("Capital loss vs Education_num-Trellis plot") +
  xlab("Education_num") +
  ylab("Capital loss")
# BUG FIX: pass the plot explicitly. ggplotly() with no argument falls back to
# last_plot(), which is only set when a plot is printed — the assigned-but-
# unprinted `p` could be skipped. Also fixed the "Captial" label typo.
q <- ggplotly(p)
q
# Capital loss vs education-num, faceted into four equal-count age bins
# (plain facet_grid, i.e. no shingles / no overlap between panels).
c <- population1994 %>% filter(X0 != 0)
g <- ggplot(data = c, aes(x = X13, y = X0, color = X0)) +
  geom_point() +
  facet_grid(cut_number(X39, n = 4) ~ .) +
  ggtitle("Capital Loss versus Education-num & Age") +
  xlab("Education_num") +
  ylab("Capital loss")
# BUG FIX: pass the plot explicitly — ggplotly() with no argument relies on
# last_plot(), which is only set on print. Also fixed the "Captial" typo.
r <- ggplotly(g)
r
# Build four overlapping age shingles (lattice::equal.count, 10% overlap) and
# expand the data so each row appears once per shingle interval containing it.
Age_Range <- lattice::equal.count(Age, number = 4, overlap = 0.1)
L <- matrix(unlist(levels(Age_Range)), ncol = 2, byrow = TRUE)
L1 <- data.frame(Lower = L[, 1], Upper = L[, 2], Interval = factor(seq_len(nrow(L))))
index <- c()
Class <- c()
for (i in seq_len(nrow(L))) {
  # Label for this shingle, e.g. "[17,30]". (The original computed this twice.)
  Cl <- paste0("[", L1$Lower[i], ",", L1$Upper[i], "]")
  ind <- which(c$X39 >= L1$Lower[i] & c$X39 <= L1$Upper[i])
  index <- c(index, ind)
  Class <- c(Class, rep(Cl, length(ind)))
}
data1994 <- c[index, ]
data1994$Class <- as.factor(Class)
p <- ggplot(data = data1994, aes(x = X13, y = X0, color = X0)) +
  geom_point() +
  facet_grid(Class ~ .) +
  ggtitle("Education vs. Capital Loss using Shingles") +
  xlab("Education_num") +
  ylab("Capital loss")
# BUG FIX: pass the plot explicitly (ggplotly() default uses last_plot(),
# which is only set on print). Also fixed the "Captial" y-axis label typo.
q <- ggplotly(p)
q